/* CreateAndMergeData.do
	Create canonical versions of agency-level and program-level data sets for subsequent analysis for
	Ban, Pamela and Seth J. Hill. "Efficacy of Congressional Oversight." American Political Science Review.
*/


* Parameters for run.
* Set your working directory here.
cd "."
* Location of data files.
global DATALOC "../data"


***************************************************
******** Format raw hearings data (AGENCY) ********
***************************************************

* load data from ProQuest on improper payments hearings (hearing-parentagency level)
import delimited "$DATALOC/IP_hearings_agency_proquest.csv", varnames(1) clear

gen nhearings_house = 1 if chamber=="House"
gen nhearings_senate = 1 if chamber=="Senate"
replace nhearings_house = 0 if nhearings_house==.
replace nhearings_senate = 0 if nhearings_senate==.

* indicator for election years
gen electionp = 1 if year==2020 | year==2016 | year==2012 | year==2008 | year==2004 | year==2000
gen electionm = 1 if year==2018 | year==2014 | year==2010 | year==2006 | year==2002
replace electionp = 0 if electionp==.
replace electionm = 0 if electionm==.
sort congress hearingid
tempfile iphear
save `iphear', replace

* merge in number of witnesses in a hearing from Ban et al. 2024 replication dataset
use "$DATALOC/witness.dta", replace
gen wit = 1
collapse (sum) wit, by(hearingid congress)
rename wit tot_witness
sort congress hearingid
merge 1:m congress hearingid using `iphear'
keep if _merge==3
drop _merge

* collapse to fiscalyear-agency level
collapse (sum) nhearings_house nhearings_senate nwords tot_witness polappt nwitness_agency subcommittee (max) electionp electionm, by(fiscalyear parent agen_alltypes_hearings) 
format %12s parent
rename nwords tot_words /* note: there are still some missing transcripts (missing words) that go into this sum, from years before 2000 due to data availability */
replace tot_words = . if tot_words==0
gen nhearings_all = nhearings_house + nhearings_senate

rename parent agency
sort fiscalyear agency
tempfile tomerge
save `tomerge', replace

*** append the manually collected 2018-2021 hearings

import delimited "$DATALOC/proquest_hearings_witnesses_2018_2021.csv", varnames(1) clear
keep date committee parent nwords tot_witness polappt subcommittee

replace committee = lower(committee)
gen chamber = "House" if strpos(committee,"house")>0
replace chamber = "Senate" if strpos(committee,"senate")>0
gen nhearings_house = 1 if chamber=="House"
gen nhearings_senate = 1 if chamber=="Senate"
gen year = 2018 if substr(date,length(date)-2+1,2)=="18"
replace year = 2019 if substr(date,length(date)-2+1,2)=="19"
replace year = 2020 if substr(date,length(date)-2+1,2)=="20"
replace year = 2021 if substr(date,length(date)-2+1,2)=="21"

* fiscal year
gen month = substr(date,4,3) if substr(date,4,1)!="." & substr(date,2,1)!="-"
replace month = substr(date,1,3) if substr(date,4,1)=="."
replace month = substr(date,3,3) if substr(date,4,1)!="." & substr(date,2,1)=="-"
gen fiscalyear = year if month=="Jan" | month=="Feb" | month=="Mar" | month=="Apr" | month=="May" | month=="Jun"
replace fiscalyear = year+1 if month=="Jul" | month=="Aug" | month=="Sep" | month=="Oct" | month=="Nov" | month=="Dec"
drop month date
order year fiscalyear
drop if fiscalyear==2022
drop if fiscalyear==2018

* indicator for election years
gen electionp = 1 if year==2020
gen electionm = 1 if year==2018
replace electionp = 0 if electionp==.
replace electionm = 0 if electionm==.

rename parent agency
keep year fiscalyear agency nhearings_house nhearings_senate nwords tot_witness polappt subcommittee electionp electionm
gen nwitness_agency = 1
* collapse to fiscalyear-agency level
collapse (sum) nhearings_house nhearings_senate nwords tot_witness polappt nwitness_agency subcommittee (max) electionp electionm, by(fiscalyear agency)
rename nwords tot_words
replace tot_words = . if tot_words==0
replace tot_witness = . if tot_witness==0
format %12s agency
gen nhearings_all = nhearings_house + nhearings_senate

append using `tomerge'
format %12s agency

sort fiscalyear agency

* agen_alltypes_hearings: all hearings (ip and non-ip)
gen prop_hearings_ip = nhearings_all / agen_alltypes_hearings
drop agen_alltypes_hearings

export delimited using "$DATALOC/hearings_agency.csv", replace


****************************************
******** Format reports data ***********
****************************************

import delimited "$DATALOC/reports_ip.csv", varnames(1) clear 


* reshape wide to long
split(agencies), p(";")
keep report_id dateissued agencies1-agencies14
reshape long agencies, i(report_id dateissued) j(num)
drop if agencies==""

* get fiscal year
gen x = strpos(dateissued,"/")
gen month = substr(dateissued,1,x-1)
gen year = substr(dateissued,length(dateissued)-1,2)
replace year = "20"+year
destring month, replace
destring year, replace
* U.S. federal government's fiscal year 2023 runs from oct 1, 2022 - sept 30, 2023
gen fiscalyear = year if month>=1 & month<=9
replace fiscalyear = year+1 if month>=10 & month<=12
drop x num month year dateissued report_id

rename agencies agency
replace agency = trim(agency)
order fiscalyear agency
* only need one obs of an agency per fiscalyear
contract fiscalyear agency
drop _freq

* clean agency names to match names in our IP dataset
*contract agency
replace agency = "usda" if agency=="Department of Agriculture"
replace agency = "usda" if agency=="Department of Agriculture, Rural Development"
replace agency = "usda" if agency=="Rural Development"
replace agency = "usda" if agency=="Forest Service"
replace agency = "usda" if agency=="Department of Rural Development"
replace agency = "doc" if agency=="Department of Commerce"
replace agency = "dod" if agency=="Department of Defense"
replace agency = "dod" if agency=="Department of Defense (Civil)"
replace agency = "ed" if agency=="Department of Education"
replace agency = "ed" if agency=="Department of Education, Related Agencies"
replace agency = "hhs" if agency=="Department of Health and Human Services"
replace agency = "hhs" if agency=="Food and Drug Administration"
replace agency = "hhs" if agency=="Food and Drug Administration"
replace agency = "hhs" if agency=="Indian Health Service"
replace agency = "dhs" if agency=="Department of Homeland Security"
replace agency = "hud" if agency=="Department of Housing and Urban Development"
replace agency = "hud" if agency=="Housing and Urban Development"
replace agency = "doj" if agency=="Department of Justice"
replace agency = "dol" if agency=="Department of Labor"
replace agency = "dot" if agency=="Department of Transportation"
replace agency = "dot" if agency=="Federal Aviation Administration"
replace agency = "va" if agency=="Department of Veterans Affairs"
replace agency = "doi" if agency=="Department of the Interior"
replace agency = "treasury" if agency=="Department of the Treasury"
replace agency = "treasury" if agency=="Tresury Department, Postal Service, Executive Office of the President, Independent Agencies"
replace agency = "epa" if agency=="Environmental Protection Agency"
replace agency = "Office of Management and Budget" if agency=="Executive Office of the President"

gen report = 1
contract fiscalyear agency report
drop _freq
sort fiscalyear agency

tempfile reports
save `reports', replace


***** AGENCIES DIRECTLY MENTIONED ABOUT IP, IN REPORTS MENTIONING IP

import delimited "$DATALOC/reports_ip.csv", varnames(1) clear

* reshape wide to long
split(agencies_ip), p(";")
keep report_id dateissued agencies_ip1 agencies_ip2
reshape long agencies_ip, i(report_id dateissued) j(num)
drop if agencies_ip==""

* get fiscal year
gen x = strpos(dateissued,"/")
gen month = substr(dateissued,1,x-1)
gen year = substr(dateissued,length(dateissued)-1,2)
replace year = "20"+year
destring month, replace
destring year, replace
* U.S. federal government's fiscal year 2023 runs from oct 1, 2022 - sept 30, 2023
gen fiscalyear = year if month>=1 & month<=9
replace fiscalyear = year+1 if month>=10 & month<=12
drop x num month year dateissued report_id

rename agencies_ip agency
replace agency = trim(agency)
order fiscalyear agency
* only need one obs of an agency per fiscalyear
contract fiscalyear agency
drop _freq

* clean agency names to match names in our IP dataset
*contract agency
replace agency = "usda" if agency=="Department of Agriculture"
replace agency = "usda" if agency=="Department of Agriculture (Food and Nutrition Service)"
replace agency = "usda" if agency=="Food and Nutrition Service"
replace agency = "dod" if agency=="Department of Defense"
replace agency = "ed" if agency=="Department of Education"
replace agency = "hhs" if agency=="Department of Health and Human Services (Centers for Medicare and Medicaid Services)"
replace agency = "dol" if agency=="Department of Labor"
replace agency = "dol" if agency=="Department of Labor (Employment and Training Administration"
replace agency = "va" if agency=="Department of Veterans Affairs"
replace agency = "doi" if agency=="Department of the Interior (Disaster Relief)"
replace agency = "dot" if agency=="Federal Aviation Administration"
replace agency = "hud" if agency=="Housing and Urban Development"
replace agency = "treasury" if agency=="Internal Revenue Service"
replace agency = "treasury" if agency=="Internal Revenue Service (EITC program)"
replace agency = "dol" if agency=="Office of Workers' Compensation Programs)"
replace agency = "Department of Energy" if agency=="Department of Energy (National Nuclear Security Administration)"

gen report_ip = 1
contract fiscalyear agency report_ip
drop _freq
sort fiscalyear agency
merge 1:1 fiscalyear agency using `reports'
drop _merge
sort fiscalyear agency
save `reports', replace


***** AGENCIES NEUTRALLY MENTIONED ABOUT IP, IN REPORTS MENTIONING IP

import delimited "$DATALOC/reports_ip.csv", varnames(1) clear


* reshape wide to long
split(agencies_ip_neutral), p(";")
keep report_id dateissued agencies_ip_neutral1 agencies_ip_neutral2
reshape long agencies_ip_neutral, i(report_id dateissued) j(num)
drop if agencies_ip_neutral==""

* get fiscal year
gen x = strpos(dateissued,"/")
gen month = substr(dateissued,1,x-1)
gen year = substr(dateissued,length(dateissued)-1,2)
replace year = "20"+year
destring month, replace
destring year, replace
* U.S. federal government's fiscal year 2023 runs from oct 1, 2022 - sept 30, 2023
gen fiscalyear = year if month>=1 & month<=9
replace fiscalyear = year+1 if month>=10 & month<=12
drop x num month year dateissued report_id

rename agencies agency
replace agency = trim(agency)
order fiscalyear agency
* only need one obs of an agency per fiscalyear
contract fiscalyear agency
drop _freq

* clean agency names to match names in our IP dataset
*contract agency
replace agency = "doi" if agency=="Bureau of Indian Affairs"
replace agency = "doi" if agency=="Bureau of Indian Affairs (Office of Special Trustee for American Indians)"
replace agency = "doi" if agency=="Bureau of Indian Affairs (Office of the Special Trustee for American Indians)"
replace agency = "doi" if agency=="Department of the Interior (Disaster Relief)"
replace agency = "doi" if agency=="Office of the Special Trustee for American Indians"
replace agency = "treasury" if agency=="Bureau of the Fiscal Service"
replace agency = "treasury" if agency=="Internal Revenue Service"
replace agency = "treasury" if agency=="Department of the Treasury"
replace agency = "usda" if agency=="Department of Agriculture (Food and Nutrition Service)"
replace agency = "usda" if agency=="Risk Management Agency"
replace agency = "hhs" if agency=="Department of Health and Human Services (Centers for Medicare and Medicaid Services)"
replace agency = "hhs" if agency=="Department of Health and Human Services (Social Security Administration)"
replace agency = "hud" if agency=="Department of Housing and Urban Development"
replace agency = "hud" if agency=="Housing and Urban Development"
replace agency = "dol" if agency=="Department of Labor"
replace agency = "dot" if agency=="Department of Transportation"
replace agency = "dot" if agency=="Department of Transportation, Housing and Urban Development"

gen report_ip_neutral = 1
contract fiscalyear agency report_ip_neutral
drop _freq
sort fiscalyear agency
merge 1:1 fiscalyear agency using `reports'
drop _merge
sort fiscalyear agency
save `reports', replace

replace report_ip=0 if report_ip==.
replace report_ip_neutral=0 if report_ip_neutral==.

* keep only agencies that are agencies in our IP data
keep if length(agency)<=4 | agency=="treasury"

gen report_ip2 = 1 if report_ip==1 | report_ip_neutral==1
replace report_ip2 = 0 if report_ip2==.

label variable report "Approp. report"
label variable report_ip2 "Approp. report direct mention"


* generate variable for FIRST REPORT
sort agency fiscalyear
egen firstyear = min(fiscalyear), by(agency)
gen report_first = 1 if fiscalyear==firstyear & report==1
replace report_first = 0 if report_first==.

egen firstyear_ip2 = min(fiscalyear), by(agency report_ip2)
gen report_ip2_first = 1 if fiscalyear==firstyear_ip2 & report_ip2==1
replace report_ip2_first = 0 if report_ip2_first ==.

gen reportXfirst = report*report_first
gen report_ip2Xfirst = report_ip2*report_ip2_first
label var reportXfirst "Approp. report * First Approp. report"
label var report_ip2Xfirst "Approp. report direct mention * First Approp. report direct mention"
label var report_first "First Approp. report"
label var report_ip2_first "First Appropr. report direct mention"

drop firstyear firstyear_ip2 report_ip report_ip_neutral

sort fiscalyear agency
order fiscalyear agency report report_ip2 report_first report_ip2_first

save "$DATALOC/reports_ip_clean.dta", replace




********************************
******** Prep IP data **********
********************************

import delimited "$DATALOC/payments_all.csv", varnames(1) clear
sort fiscalyear agency
tempfile ip
save `ip', replace
* get first and last year of reporting for an agency, for descriptive table
egen firstyear = min(fiscalyear), by(agency)
egen lastyear = max(fiscalyear), by(agency)
contract agency firstyear lastyear



****************************************
******** Hearings data (AGENCY) ********
****************************************


* construct variable for # years since last hearing (for that agency)
* loading file created earlier in 01_CreateAndMergeData.do:
import delimited "$DATALOC/hearings_agency.csv", varnames(1) clear
sort agency fiscalyear
* fill in missing years (zeros) - need to declare tsset format first
encode agency, gen(agencyid)
tsset agencyid fiscalyear
* tsfill to add new observations with missing values for missing time periods (strongly balanced option)
tsfill, full
decode agencyid, gen(agencyname)
order fiscalyear agency agencyid
replace agency = agencyname
drop agencyname

* Prepend hearing variables with `agen' qualifier.
rename nhearings_house agen_nhear_h
rename nhearings_senate agen_nhear_s
rename nhearings_all agen_nhear_all
rename electionm agen_electionm_all
rename electionp agen_electionp_all
rename prop_hearings_ip agen_prop_all
foreach var in agen_nhear_all agen_nhear_h agen_nhear_s tot_words tot_witness polappt nwitness_agency subcommittee agen_electionm_all agen_electionp_all agen_prop_all {
	qui replace `var' = 0 if `var'==.
}

* agen_prop_hear is truly missing for 2018 onwards because we do not have the
* denominator for all hearings. we manually collected the numerator.
replace agen_prop_all = . if fiscalyear>=2018

* Create moderators and lags.
gen agen_nwords_all = log((tot_words+1))
rename tot_witness agen_nwitness_all
rename polappt agen_polappt_all
rename nwitness_agency agen_nwitnessagency_all
rename subcommittee agen_subcom_all
drop tot_words
label var agen_nwitness_all "Number of Witnesses, This Fiscal Year"
label var agen_polappt_all "Number of Political Agency Witnesses, This Fiscal Year"
label var agen_nwitnessagency_all "Number of Agency Witnesses, This Fiscal Year"
label var agen_nwords_all "Length of Hearing Transcripts (log words), This Fiscal Year"
label var agen_subcom_all "Subcommittee Hearing, This Fiscal Year"
label var agen_electionm_all "Midterm Election Year, This Fiscal Year"
label var agen_electionp_all "Pres. Election year, This Fiscal Year"
label var agen_prop_all "Prop. Hearings on IP, This Fiscal Year"

* Lags
foreach var of varlist agen_nwords_all agen_nwitness_all agen_nwitnessagency_all agen_polappt_all agen_subcom_all agen_electionm_all agen_electionp_all {
	local vlabel: var label `var'
	forvalues l=1/4 {
		qui g `var'_lag`l' = L`l'.`var'
		local this_label = "`vlabel', t-`l'"
		label var `var'_lag`l' "`this_label'"
	}
}

* construct lagged hearings
gen agen_nhear_all_lag1 = l1.agen_nhear_all
gen agen_nhear_all_lag2 = l2.agen_nhear_all
gen agen_nhear_all_lag3 = l3.agen_nhear_all
gen agen_nhear_all_lag4 = l4.agen_nhear_all
gen agen_nhear_h_lag1 = l1.agen_nhear_h
gen agen_nhear_h_lag2 = l2.agen_nhear_h
gen agen_nhear_h_lag3 = l3.agen_nhear_h
gen agen_nhear_s_lag1 = l1.agen_nhear_s
gen agen_nhear_s_lag2 = l2.agen_nhear_s
gen agen_nhear_s_lag3 = l3.agen_nhear_s
gen agen_prop_all_lag1 = l1.agen_prop_all
gen agen_prop_all_lag2 = l2.agen_prop_all
gen agen_prop_all_lag3 = l3.agen_prop_all
gen agen_prop_all_lag4 = l4.agen_prop_all

* construct binary versions
gen agen_hear_all = 1 if agen_nhear_all>0
gen agen_hear_all_lag1 = 1 if agen_nhear_all_lag1>0
gen agen_hear_all_lag2 = 1 if agen_nhear_all_lag2>0
gen agen_hear_all_lag3 = 1 if agen_nhear_all_lag3>0
gen agen_hear_all_lag4 = 1 if agen_nhear_all_lag4>0
gen agen_hear_h = 1 if agen_nhear_h>0
gen agen_hear_h_lag1 = 1 if agen_nhear_h_lag1>0
gen agen_hear_h_lag2 = 1 if agen_nhear_h_lag2>0
gen agen_hear_h_lag3 = 1 if agen_nhear_h_lag3>0
gen agen_hear_s = 1 if agen_nhear_s>0
gen agen_hear_s_lag1 = 1 if agen_nhear_s_lag1>0
gen agen_hear_s_lag2 = 1 if agen_nhear_s_lag2>0
gen agen_hear_s_lag3 = 1 if agen_nhear_s_lag3>0
foreach var in agen_hear_all agen_hear_all_lag1 agen_hear_all_lag2 agen_hear_all_lag3 agen_hear_all_lag4 agen_hear_h agen_hear_h_lag1 agen_hear_h_lag2 agen_hear_h_lag3 agen_hear_s agen_hear_s_lag1 agen_hear_s_lag2 agen_hear_s_lag3 {
	qui replace `var' = 0 if `var'==.
}

* generate variables for # years since last hearing (for that agency)
sort agency fiscalyear
* Any chamber.
* 0 years since last hearing if there is a hearing this year
gen last_agen_hear = 0 if agen_nhear_all>0
* increment by 1 year IF if there is NOT a hearingthis year
replace last_agen_hear = last_agen_hear[_n-1] + 1 if agency==agency[_n-1] & agen_nhear_all==0
* missing values of last_hear are before we observed any agen_hearing in the full time period
* SET TO ZERO since agen_hearing variable would also equal 0 for these cases
replace last_agen_hear = 0 if last_agen_hear==.
* Recode to zero if last yearing more than 20 years ago.
replace last_agen_hear = 0 if last_agen_hear > 20


* generate variable for cumulative sum of hearings (for that agency)
sort agency fiscalyear
gen agen_csum_nhear = agen_nhear_all
gen agen_csum_nhearb = 1 if agen_nhear_all>0
replace agen_csum_nhearb = 0 if agen_nhear_all==0
* increment by # hearings that year
replace agen_csum_nhear = agen_csum_nhear + agen_csum_nhear[_n-1] if agency==agency[_n-1]
* increment by 1 if hearing that year
replace agen_csum_nhearb = agen_csum_nhearb + agen_csum_nhearb[_n-1] if agency==agency[_n-1]
* previous sum of hearings (excluding this year)
gen agen_prevsum_nhear = agen_csum_nhear - agen_nhear_all
gen agen_prevsum_nhearb = agen_csum_nhearb - 1 if agen_nhear_all >0
replace agen_prevsum_nhearb = agen_csum_nhearb if agen_nhear_all==0


* House.
* 0 years since last hearing if there is a hearing this year
gen last_agen_hear_h = 0 if agen_nhear_h>0
* increment by 1 year IF if there is NOT a hearingthis year
replace last_agen_hear_h = last_agen_hear_h[_n-1] + 1 if agency==agency[_n-1] & agen_nhear_h==0
* missing values of last_hear are before we observed any agen_hearing in the full time period
* SET TO ZERO since agen_hearing variable would also equal 0 for these cases
replace last_agen_hear_h = 0 if last_agen_hear_h==.
* Recode to zero if last yearing more than 20 years ago.
replace last_agen_hear_h = 0 if last_agen_hear_h > 20

* Senate.
* 0 years since last hearing if there is a hearing this year
gen last_agen_hear_s = 0 if agen_nhear_s>0
* increment by 1 year IF if there is NOT a hearingthis year
replace last_agen_hear_s = last_agen_hear_s[_n-1] + 1 if agency==agency[_n-1] & agen_nhear_s==0
* missing values of last_hear are before we observed any agen_hearing in the full time period
* SET TO ZERO since agen_hearing variable would also equal 0 for these cases
replace last_agen_hear_s = 0 if last_agen_hear_s==.
* Recode to zero if last yearing more than 20 years ago.
replace last_agen_hear_s = 0 if last_agen_hear_s > 20

* Unified vs divided government.
qui g unified = inlist(fiscalyear,2021,2018,2017,2010,2009,2006,2005,2004,2003)
qui g agen_hearXuni = agen_hear_all*unified
label var agen_hearXuni "Agency Hearing * Unified Government"
* generate variables for # years since last unified hearing.
sort agency fiscalyear
* 0 years since last hearing if there is a hearing this year
gen last_agenXuni = 0 if agen_hearXuni == 1
* increment by 1 year IF if there is NOT a hearingthis year
replace last_agenXuni = last_agenXuni[_n-1] + 1 if agency==agency[_n-1] & agen_hearXuni==0
* missing values of last_hear are before we observed any agen_hearing in the full time period
* SET TO ZERO since agen_hearing variable would also equal 0 for these cases
replace last_agenXuni = 0 if last_agenXuni ==.
* Recode to zero if last yearing more than 20 years ago.
replace last_agenXuni = 0 if last_agenXuni > 20
label var last_agenXuni "Years Since Last Unified Govt Hearing"
tab last_agen_hear last_agenXuni



* PCA of alternative predictors
* gen scale for oversight prioritization using pca on num witnesses, num words, num hearings, prop. of hearings
sort agencyid fiscalyear
levelsof agencyid, local(agencyids)
gen agen_pc1_all = .
foreach i of local agencyids {
	pca agen_nwitness_all agen_nwords_all agen_nhear_all agen_prop_all if agencyid == `i'
	rotate
	predict agen_pc1_thisagency
	replace agen_pc1_all = agen_pc1_thisagency if agencyid==`i'
	drop agen_pc1_thisagency
}
label var agen_pc1_all "Priority Score, This Fiscal Year"


*
* Generate Years since last hearing * moderator interactions.
*

tempfile full
save `full', replace
* Create agency-fiscal year data set of moderator variables.
keep agencyid fiscalyear agen_nwitness_all agen_nwords_all agen_nwitnessagency_all agen_polappt_all agen_subcom_all agen_electionm_all agen_electionp_all agen_pc1_all agen_prop_all
* Rename fiscalyear to merging variable `fy_last_hring'
rename fiscalyear fy_last_hring
* Rename moderators.
foreach var of varlist *_all {
	local newname = subinstr("`var'","_all","_last_hr",.)
	rename `var' `newname'
}
label var agen_nwitness_last_hr "Number of Witnesses, Fiscal Year of Last Hearing"
label var agen_polappt_last_hr "Number of Political Agency Witnesses, Fiscal Year of Last Hearing"
label var agen_nwitnessagency_last_hr "Number of Agency Witnesses, Fiscal Year of Last Hearing"
label var agen_nwords_last_hr "Length of Hearing Transcripts (log words), Fiscal Year of Last Hearing"
label var agen_subcom_last_hr "Subcommittee Hearing, Fiscal Year of Last Hearing"
label var agen_electionm_last_hr "Midterm Election Year, Fiscal Year of Last Hearing"
label var agen_electionp_last_hr "Pres. Election Year, Fiscal Year of Last Hearing"
label var agen_pc1_last_hr "Priority Score, Fiscal Year of Last Hearing"
label var agen_prop_last_hr "Prop. Hearings on IP, Fiscal Year of Last Hearing"


sort agencyid fy_last_hring
tempfile modrs
save `modrs', replace

* Recall full data set.
use `full', clear
* Calculate fiscal year of last hearing for each agency-fiscal year.
qui g fy_last_hring = fiscalyear - last_agen_hear
summ fy_last_hring
* Merge in moderators for each record's fiscal year of last hearing.
joinby agencyid fy_last_hring using `modrs', unmatched(master)
tab fiscalyear _merge
drop _merge 
* Check
li fiscalyear agen_nhear_all last_agen_hear fy_last_hring agen_nwords_all agen_nwords_last_hr agen_subcom_last_hr agen_prop_last_hr if agency == "usda"

* Years since last hearing * moderator interactions.
foreach var of varlist agen*_last_hr {
	qui g lX`var'  = last_agen_hear*`var'
}
* Check
li fiscalyear agen_nhear_all last_agen_hear fy_last_hring agen_nwords_all lXagen_nwords_last_hr if agency == "usda"
* Label
label var lXagen_nwitness_last_hr "Years Since Last * Number Witnesses, Fiscal Year of Last"
label var lXagen_nwitnessagency_last_hr "Years Since Last * Number Agency Witnesses, Fiscal Year of Last"
label var lXagen_nwords_last_hr "Years Since Last * Length of Hearing Transcripts, Fiscal Year of Last"
label var lXagen_polappt_last_hr "Years Since Last * Agency Political Appointee Witnesses, Fiscal Year of Last"
label var lXagen_subcom_last_hr "Years Since Last * Subcommittee Hearing, Fiscal Year of Last"
label var lXagen_electionm_last_hr "Years Since Last * Midterm Election Year, Fiscal Year of Last"
label var lXagen_electionp_last_hr "Years Since Last * Pres. Election Year, Fiscal Year of Last"
label var lXagen_pc1_last_hr "Years Since Last * Priority Score, Fiscal Year of Last"
label var lXagen_prop_last_hr "Years Since Last * Prop. Hearings on IP, Fiscal Year of Last"
desc lX*

* Generate hearing * statute year interactions
qui g statute = inlist(fiscalyear,2001,2002,2010,2012,2019)
qui g agen_hearXstatute = agen_hear_all*statute
label var agen_hearXstatute "Agency Hearing * Statute Year"


********************
* Sort and label
********************
order fiscalyear agency last_agen_hear
drop agencyid

* Labels.
label variable agen_hear_all "Agency Hearing"
label variable agen_hear_all_lag1 "Agency Hearing t-1"
label variable agen_hear_all_lag2 "Agency Hearing t-2"
label variable agen_hear_all_lag3 "Agency Hearing t-3"
label variable agen_hear_all_lag4 "Agency Hearing t-4"
label variable agen_nhear_all "Number Agency Hearings"
label variable agen_nhear_all_lag1 "Number Agency Hearings t-1"
label variable agen_nhear_all_lag2 "Number Agency Hearings t-2"
label variable agen_nhear_all_lag3 "Number Agency Hearings t-3"
label variable agen_nhear_all_lag4 "Number Agency Hearings t-4"
label variable agen_hear_h "House Agency Hearing"
label variable agen_hear_h_lag1 "House Agency Hearing t-1"
label variable agen_hear_h_lag2 "House Agency Hearing t-2"
label variable agen_hear_h_lag3 "House Agency Hearing t-3"
label variable agen_hear_s "Senate Agency Hearing"
label variable agen_hear_s_lag1 "Senate Agency Hearing t-1"
label variable agen_hear_s_lag2 "Senate Agency Hearing t-2"
label variable agen_hear_s_lag3 "Senate Agency Hearing t-3"
label variable agen_nhear_h "Number House Agency Hearings"
label variable agen_nhear_s "Number Senate Agency Hearings"
label variable agency "Agency"
label variable fiscalyear "Fiscal Year"
label variable last_agen_hear "Years Since Last Agency Hearing"
label variable last_agen_hear_h "Years Since Last House Agency Hearing"
label variable last_agen_hear_s "Years Since Last Senate Agency Hearing"
label variable agen_prevsum_nhear "Previous Number Agency Hearings"
label variable agen_prevsum_nhearb "Previous Agency Hearings"

sort fiscalyear agency
tempfile hearings_agency
save `hearings_agency', replace



*******************************************
******** Hearings data (SPILLOVER) ********
*******************************************


* for each committee, use set of agencies called in (more than once)
* for oversight hearings from 1990-2000

import delimited using "$DATALOC/adjacent_committees.csv", varnames(1) clear
tempfile adjacent
save `adjacent', replace

* start with IP hearings
import delimited using "$DATALOC/IP_hearings_agency_proquest.csv", clear
keep if fiscalyear>=2000
contract fiscalyear cmt_govtrack
drop _freq
* add manually collected 2019-2021 IP hearings with their cmt_govtrack manually
set obs 53
replace fiscalyear = 2019 in 47
replace cmt_govtrack = "Senate Committee on Government Operations" in 47
replace fiscalyear = 2019 in 48
replace cmt_govtrack = "Senate Committee on Agriculture and Forestry" in 48
replace fiscalyear = 2020 in 49
replace cmt_govtrack = "House Committee on Veterans' Affairs" in 49
replace fiscalyear = 2020 in 50
replace cmt_govtrack = "Senate Committee on Finance" in 50
replace fiscalyear = 2021 in 51
replace cmt_govtrack = "House Committee on Small Business" in 51
replace fiscalyear = 2021 in 52
replace cmt_govtrack = "Senate Committee on Finance" in 52
replace fiscalyear = 2021 in 53
replace cmt_govtrack = "Senate Committee on Small Business (Select)" in 53

sort fiscalyear cmt_govtrack
* for a committee holding an IP hearing, match to its adjacent committees 
joinby cmt_govtrack using `adjacent'

gen hear_spillover=1
rename parent agency
sort fiscalyear agency
quietly by fiscalyear agency: gen dup = cond(_N==1,0,_n)
contract fiscalyear agency hear_spillover
drop _freq
sort fiscalyear agency

* keep agencies used in our analysis
keep if length(agency)<=3 | agency=="treasury" | agency=="usaid" | agency=="usda"
drop if agency=="CBP" | agency=="EOP" | agency=="FBI"

sort agency fiscalyear
* fill in missing years (zeros) - need to declare tsset format first
encode agency, gen(agencyid)
tsset agencyid fiscalyear
* tsfill to add new observations with missing values for missing time periods (strongly balanced option)
tsfill, full
decode agencyid, gen(agencyname)
order fiscalyear agency agencyid
replace agency = agencyname
drop agencyname

* Prepend hearing variables with `agen' qualifier.
rename hear_spillover agen_hear_spill
qui replace agen_hear_spill = 0 if agen_hear_spill==.

label var agen_hear_spill "Hearing Spillover"

* Lags
local vlabel: var label agen_hear_spill
forvalues l=1/4 {
		qui g agen_hear_spill_lag`l' = L`l'.agen_hear_spill
		local this_label = "`vlabel', t-`l'"
		label var agen_hear_spill_lag`l' "`this_label'"
	}

* generate variables for # years since last hearing (for that agency)
sort agency fiscalyear
* Any chamber.
* 0 years since last hearing if there is a hearing this year
gen last_agen_hear_spill = 0 if agen_hear_spill>0
* increment by 1 year IF if there is NOT a hearing this year
replace last_agen_hear_spill = last_agen_hear_spill[_n-1] + 1 if agency==agency[_n-1] & agen_hear_spill==0
* missing values of last_hear are before we observed any agen_hearing in the full time period
* SET TO ZERO since agen_hearing variable would also equal 0 for these cases
replace last_agen_hear_spill = 0 if last_agen_hear_spill==.
* Recode to zero if last yearing more than 20 years ago.
replace last_agen_hear_spill = 0 if last_agen_hear_spill > 20
		
order fiscalyear agency last_agen_hear_spill
drop agencyid	

label variable last_agen_hear "Years Since Last Hearing Spillover"

rename fiscalyear FY
sort FY agency
tempfile hearings_agency_spill
save `hearings_agency_spill', replace



********************************
****** AGENCY-LEVEL MERGE ******
********************************

use `ip', replace
destring outlaysamountm, force replace
destring improperpaymentamountm, force replace
* drop if it was NA for outlays and/or IP amount
drop if outlaysamountm==. | improperpaymentamountm==.
*gen check = improperpaymentamountm / outlaysamountm
collapse (sum) improperpaymentamountm outlaysamountm, by(fiscalyear agency)
* agency-level IP rate as percent
gen iprate = 100*improperpaymentamountm / outlaysamountm
label variable iprate "IP Rate (percent)"  
summ iprate

* Merge agency-level hearings.
sort fiscalyear agency
merge 1:1 fiscalyear agency using `hearings_agency'
keep if _merge==1 | _merge==3
drop _merge

* Set to zero non-merges.
foreach var of varlist agen_* last_agen* lX*  {
	qui replace `var' = 0 if `var'==.
}


* merge in appropriations report data
sort fiscalyear agency
merge 1:1 fiscalyear agency using "$DATALOC/reports_ip_clean.dta"
drop if _merge==2
foreach var of varlist report report_ip2 report_first report_ip2_first reportXfirst report_ip2Xfirst {
	replace `var' = 0 if _merge==1
}
drop _merge

* merge in agency budget data
sort fiscalyear agency
merge 1:1 fiscalyear agency using "$DATALOC/budget_agency.dta"
drop if _merge==2
drop _merge
replace ba = log(ba+1)
replace dba = log(dba+1)
rename ba agen_ba
rename dba agen_dba
label variable agen_ba "log Agency Budget Authority"
label variable agen_dba "log Agency Discretionary Budget Authority"


* Panel variables.
rename fiscalyear FY
encode agency, gen(agencyid)
tsset agencyid FY

* Lagged values of appropriations reports
foreach var of varlist report report_ip2 {
	local vlabel: var label `var'
	qui g `var'_lag1 = L1.`var'
	local this_label = "`vlabel', t-1"
	label var `var'_lag1 "`this_label'"
}

* Lagged values of iprate (to look for pretrends)
gen iprate_lag1 = l1.iprate
gen iprate_lag2 = l2.iprate
gen iprate_lag3 = l3.iprate
* Log IP dollars.
qui g l1_log_ips = log(L1.improperpaymentamountm )
label var l1_log_ips "Log improper payment dollars, t-1"
gen log_ips = log(improperpaymentamountm)
label var log_ips "Log improper payment dollars"

* merge in hearing spillover
sort FY agency
merge 1:1 FY agency using `hearings_agency_spill'
keep if _merge==1 | _merge==3
drop _merge
* Set to zero non-merges.
foreach var of varlist agen_hear_spill* last_agen_hear_spill {
	qui replace `var' = 0 if `var'==.
}


tempfile amaster
save `amaster', replace
* generate indicator for above/below 50th percentile program mean IP
collapse (mean) Mean_IP=iprate (sum) ip_dollars=improperpaymentamountm outlays=outlaysamountm if FY > 2004, by(agency)
sum Mean_IP, detail
* 50th percentile mean IP: 1.211
gen agen_above50 = 1 if Mean_IP > 1.211
label var agen_above50 "Agency IP rate above 50th percentile"
replace agen_above50 = 0 if agen_above50==.
keep agency agen_above50
sort agency
tempfile agency_50
save `agency_50', replace
use `amaster', replace
merge m:1 agency using `agency_50'
drop _merge

qui g last_agen_2_plus = (last_agen_hear > 1 & last_agen_hear != .)
label variable last_agen_2_plus "Agency Hearing Prior to this Year"


* +++++++++++++++++++
* Determine intersection of current variables in memory and variables needed for analysis
* to produce smaller and simpler analysis data file.
* +++++++++++++++++++
* Retrieve the list of variables currently in memory
describe, varlist
local vars_in_memory `r(varlist)'

* Variables needed for analysis.
local external_vars "iprate agen_nhear_all agen_hear_all last_agen_hear agency FY agen_hear_h agen_hear_s last_agen_hear_h last_agen_hear_s agen_hearXuni last_agenXuni programname agen_nwords_all lXagen_nwords_last_hr agen_nwitnessagency_all lXagen_nwitnessagency_last_hr agencyid log_ip_dollars log_ip_rate L1_log_ip_rate L2_log_ip_rate L3_log_ip_rate L1_log_ip_dollars L2_log_ip_dollars L3_log_ip_dollars hearingXyear first_hearing has_had_hearing last_agen_2_plus agen_hear_all_lag1 agen_hear_all_lag2 agen_hear_all_lag3 agen_hear_all_lag4 programid agen_above50 program_above50 agen_prevsum_nhearb agen_hear_spill last_agen_hear_spill agen_hearXstatute agen_prop_all lXagen_prop_last_hr agen_ba agen_dba agen_nwitness_all lXagen_nwitness_last_hr agen_polappt_all lXagen_polappt_last_hr agen_subcom_all lXagen_subcom_last_hr agen_electionp_all lXagen_electionp_last_hr report report_ip2 reportXfirst report_first report_ip2Xfirst report_ip2_first report_lag1 report_ip2_lag1 fiscalyear outlaysamountm improperpaymentamountm improperpaymentrate ip_rate outlays tot_outlays Mean_IP ip_dollars Weighted_mean_IP lb95_agen coef_agen ub95_agen lb95_prog coef_prog ub95_prog event date nhear_all program_id num_hearings num_hearing_years FY_of_first years_since_first num_obs program_outlays filename title hearing_id n_hearings n_agencies num_witness_agency agency_year"

* Find the intersection
local intersection
foreach var of local external_vars {
    if strpos(" `vars_in_memory' ", " `var' ") {
        local intersection `intersection' `var'
    }
}
* Keep intersection of variables.
keep `intersection'
* Label variables.
label variable agency "Federal agency"
label variable FY "Fiscal year"
label variable improperpaymentamountm "Improper payment $s in fiscal year"
label variable outlaysamountm  "Total outlay $s in fiscal year"
label variable agencyid "Agency id"
desc

* +++++++++++++++++++
* Export analysis data files.
* +++++++++++++++++++
compress
* CSV version.
export delimited using "$DATALOC/agencylevel.csv", replace
* Stata version.
save "$DATALOC/AgencyLevel.dta", replace
* tempfile version
tempfile agency
save `agency', replace
summ	




*********************************
****** PROGRAM-LEVEL MERGE ******
*********************************

use `ip', replace
destring outlaysamountm, force replace
destring improperpaymentamountm, force replace
* drop if it was NA for outlays and/or IP amount
drop if outlaysamountm==. | improperpaymentamountm==.
*gen check = improperpaymentamountm / outlaysamountm
collapse (sum) improperpaymentamountm outlaysamountm, by(fiscalyear agency programname programid)
* program-level IP rate
gen iprate = 100*improperpaymentamountm / outlaysamountm
label variable iprate "IP Rate (percent)"  
summ iprate

* get around error about strL format for programname
gen str programname2 = programname
replace programname = ""
compress programname
replace programname = programname2
drop programname2
describe programname
format %24s programname



* Merge agency-level hearings.
sort fiscalyear agency
joinby fiscalyear agency using `hearings_agency', unmatched(master)
tab _merge
keep if _merge==1 | _merge==3
drop _merge

* Set to zero non-merges.
foreach var of varlist agen_* last_agen_* last_agenXuni lX* {
	qui replace `var' = 0 if `var'==.
}

* merge in appropriations report data
merge m:1 fiscalyear agency using "$DATALOC/reports_ip_clean.dta"
drop if _merge==2
foreach var of varlist report report_ip2 report_first report_ip2_first reportXfirst report_ip2Xfirst {
	replace `var' = 0 if _merge==1
}
drop _merge


* Panel variables.
rename fiscalyear FY
encode agency, gen(agencyid)
sort agency programname FY
*order FY agency programname programid outlaysamountm improperpaymentamountm iprate hear* nhear* 
order FY agency programname programid outlaysamountm improperpaymentamountm iprate
* agency_year group id.
egen agency_year = group(agency FY)
tsset programid FY

* Lag values of appropriations reports
foreach var of varlist report report_ip2 {
	local vlabel: var label `var'
	qui g `var'_lag1 = L1.`var'
	local this_label = "`vlabel', t-1"
	label var `var'_lag1 "`this_label'"
}

* Lagged values of iprate (to look for pretrends)
gen iprate_lag1 = l1.iprate
gen iprate_lag2 = l2.iprate
gen iprate_lag3 = l3.iprate
* Log IP dollars.
qui g l1_log_ips = log(L1.improperpaymentamountm )
label var l1_log_ips "Log improper payment dollars, t-1"
gen log_ips = log(improperpaymentamountm)
label var log_ips "Log improper payment dollars"

order FY agency programname programid iprate*



* merge in hearing spillover
sort FY agency
merge m:1 FY agency using `hearings_agency_spill'
keep if _merge==1 | _merge==3
drop _merge
* Set to zero non-merges.
foreach var of varlist agen_hear_spill* last_agen_hear_spill {
	qui replace `var' = 0 if `var'==.
}

tempfile master
save `master', replace
* generate indicator for above/below 50th percentile program mean IP
collapse (mean) Mean_IP=iprate (sum) ip_dollars=improperpaymentamountm outlays=outlaysamountm if FY > 2004, by(programname)
sum Mean_IP, detail
* 50th percentile mean IP: 1.08
gen program_above50 = 1 if Mean_IP > 1.08
label var program_above50 "Program IP rate above 50th percentile"
replace program_above50 = 0 if program_above50==.
keep programname program_above50
sort programname
tempfile program_50
save `program_50', replace
use `master', replace
merge m:1 programname using `program_50'
drop _merge

qui g last_agen_2_plus = (last_agen_hear > 1 & last_agen_hear != .)
label variable last_agen_2_plus "Agency Hearing Prior to this Year"

* +++++++++++++++++++
* Determine intersection of current variables in memory and variables needed for analysis
* to produce smaller and simpler analysis data file.
* +++++++++++++++++++
* Retrieve the list of variables currently in memory
describe, varlist
local vars_in_memory `r(varlist)'

* Variables needed for analysis.
local external_vars "iprate agen_hear_all agen_nhear_all last_agen_hear agency FY agen_hear_h agen_hear_s last_agen_hear_h last_agen_hear_s agen_hearXuni last_agenXuni programname agen_nwords_all lXagen_nwords_last_hr agen_nwitnessagency_all lXagen_nwitnessagency_last_hr agencyid log_ip_dollars log_ip_rate L1_log_ip_rate L2_log_ip_rate L3_log_ip_rate L1_log_ip_dollars L2_log_ip_dollars L3_log_ip_dollars hearingXyear first_hearing has_had_hearing last_agen_2_plus agen_hear_all_lag1 agen_hear_all_lag2 agen_hear_all_lag3 agen_hear_all_lag4 programid agen_above50 program_above50 agen_prevsum_nhearb agen_hear_spill last_agen_hear_spill agen_hearXstatute agen_prop_all lXagen_prop_last_hr agen_ba agen_dba agen_nwitness_all lXagen_nwitness_last_hr agen_polappt_all lXagen_polappt_last_hr agen_subcom_all lXagen_subcom_last_hr agen_electionp_all lXagen_electionp_last_hr report report_ip2 reportXfirst report_first report_ip2Xfirst report_ip2_first report_lag1 report_ip2_lag1 fiscalyear outlaysamountm improperpaymentamountm improperpaymentrate ip_rate outlays tot_outlays Mean_IP ip_dollars Weighted_mean_IP lb95_agen coef_agen ub95_agen lb95_prog coef_prog ub95_prog event date nhear_all program_id num_hearings num_hearing_years FY_of_first years_since_first num_obs program_outlays filename title hearing_id n_hearings n_agencies num_witness_agency agency_year"

* Find the intersection
local intersection
foreach var of local external_vars {
    if strpos(" `vars_in_memory' ", " `var' ") {
        local intersection `intersection' `var'
    }
}
* Keep intersection of variables.
keep `intersection'
* Label variables.
label variable agency "Federal agency"
label variable agencyid "Agency id"
label variable FY "Fiscal year"
label variable improperpaymentamountm "Improper payment $s in fiscal year"
label variable outlaysamountm  "Total outlay $s in fiscal year"
label variable programname "Federal program name"
label variable programid "Program id"
label variable agency_year "Agency-fiscal year id"
desc

* +++++++++++++++++++
* Export analysis data files.
* +++++++++++++++++++
compress
* CSV version.
export delimited using "$DATALOC/programlevel.csv", replace
* Stata version.
save "$DATALOC/ProgramLevel.dta", replace
* tempfile version
tempfile payments
save `payments', replace
summ



